#!/usr/bin/env bash

# Slurm batch queue:
# Show the current batch jobs status broken down into userids.
# Can also show per-user or per-account status.

# Author: Ole H. Nielsen, Technical University of Denmark
# E-mail: Ole.H.Nielsen@fysik.dtu.dk
# Home page: https://github.com/OleHolmNielsen/Slurm_tools

# Command usage:
# Print the command synopsis and a one-line description of every option
# accepted by the getopts loop to stdout.
usage()
{
cat <<EOF
Usage: $0 [-u username] [-a account] [-p partition] [-q QOS] [-A] [-C] [-r] [-h]
where:
        -u username: Print only user <username>
        -a account: Print only jobs in Slurm account <account>
        -A: Print only ACCT_TOTAL lines
        -C: Print comma separated lines for Excel
        -p partition: Print only jobs in partition <partition-list>
        -q qos-list: Print only jobs in QOS <qos-list>
        -r: Print additional job Reason columns
        -h: Print this help information
EOF
}

# Set the maximum length of user and account names (a minimum of 11 to fit the *_TOTAL strings)
# These are exported because the awk programs below read them through ENVIRON.
export maxlu=11		# Usernames
export maxla=7		# Account names
# Set the percentage of MaxJobCount above which we print a warning.
export maxjobpercent=80

# Temporary file for headers.
# mktemp creates a new, empty file with an unpredictable name, so a file or
# symlink planted in /tmp under a guessable name cannot be written through.
TMPFILE=$(mktemp /tmp/showuserjobs.XXXXXX) || {
	echo "ERROR: Cannot create a temporary file in /tmp" >&2
	exit 1
}
# Remove the temporary file on every exit path (normal end, usage errors, -C mode)
trap 'rm -f -- "$TMPFILE"' EXIT

#
# Process command arguments
#
# Defaults for the option-controlled settings.  They are exported because the
# awk programs further down pick them up through ENVIRON.
export account_totals_only=0	# Set to 1 by -A
export comma_separated=0	# Set to 1 by -C
export SORT_SEP=""		# Field separator option for sort, set by -C
export reason_print=0		# Set to 1 by -r
export account=""		# Account selected by -a

# Each selection option also appends a descriptive line to the header file $TMPFILE.
while getopts "u:a:p:q:ACrh" options; do
	case "$options" in
		u )	export username=$OPTARG
			echo "Select only user $username" >> "$TMPFILE"
			;;
		a )	export account=$OPTARG
			echo "Select only users in account $account" >> "$TMPFILE"
			;;
		A )	export account_totals_only=1
			echo "Select only ACCT_TOTAL lines" >> "$TMPFILE"
			;;
		C )	export comma_separated=1
			export SORT_SEP="-t ,"
			;;
		p )	# Stored as a complete "-p <list>" option string which is
			# expanded unquoted on the sinfo/squeue command lines.
			export partition="-p $OPTARG"
			# An empty sinfo listing means the partition is unknown
			if [[ -z "$(sinfo -ho %P -p "$OPTARG")" ]]
			then
				echo "ERROR: Partition $OPTARG does not exist on this cluster" >&2
				exit 1
			fi
			echo "Print only jobs in partition $OPTARG" >> "$TMPFILE"
			;;
		q )	# Stored as a complete "-q <list>" option string (see -p above)
			export qoslist="-q $OPTARG"
			# An empty sacctmgr listing means the QOS is unknown
			if [[ -z "$(sacctmgr -n show qos "$OPTARG")" ]]
			then
				echo "ERROR: QOS $OPTARG does not exist on this cluster" >&2
				exit 1
			fi
			echo "Print only jobs in QOS $OPTARG" >> "$TMPFILE"
			;;
		r )	export reason_print=1
			{
				echo "Print additional job Reason columns (see 'man squeue'):"
				echo "    Priority:   Waiting for resources or higher priority jobs"
				echo "    Dependency: Waiting for a dependent job to complete"
				echo "    CpuLimit:   AssocGrpCpuLimit, AssocGrpCPURunMinutesLimit"
				echo "    Held:       JobHeldUser, JobHeldAdmin"
			} >> "$TMPFILE"
			;;
		h | \? )	# -h, or the literal "?" which getopts returns for an invalid option.
			# The "?" is escaped: unquoted it is a glob matching any single character.
			usage
			exit 1
			;;
		* )	usage
			exit 1
			;;
	esac
done

# WARNING: GNU gawk version 4.0 or later is required for arrays of arrays
# The third word of the first "awk --version" line is taken as the version
# string, and only its major number (the part before the first dot) is kept.
# NOTE: only a major version of exactly 3 is rejected here; any other value
# (including the output of a non-GNU awk) passes this check.
awk_version=$(awk --version | head -1 | awk '{ split($3, part, "."); print part[1] }')
if [[ "$awk_version" == "3" ]]
then
	echo -n "Sorry, gawk version 4.0 or later is required.  Your version is: "
	awk --version | head -1
	exit 1
fi

# Test for extraneous command line arguments
# check_extra_args: called with the script's arguments.  getopts leaves OPTIND
# at the index of the first argument it did not consume, so more than
# OPTIND-1 arguments means something was left over.  In that case an error
# and the usage text are written to stderr and the script exits with status 1.
check_extra_args()
{
	if (( $# > OPTIND - 1 ))
	then
		# "$*" is quoted so that arguments such as "*" are shown as typed
		# instead of being expanded into file names.
		echo "ERROR: Too many command line arguments: $*" >&2
		usage >&2
		exit 1
	fi
}
check_extra_args "$@"

# Take the third word of the ClusterName line of the Slurm configuration
# and append a title line with the current date to the header file.
SYSTEMNAME=$(scontrol show config | awk '/ClusterName/ { print $3 }')
echo Batch job status for cluster $SYSTEMNAME at $(date) >> $TMPFILE

#
# Print a nodes state summary
#

# Count the nodes and CPUs per node state and append the table to the header file.
# sinfo output columns: Nodename CPUs Partition State
# $partition is left unquoted on purpose: it holds a complete "-p <list>" option or nothing.
sinfo --noheader -N  $partition -o  "%N %c %P %6t" | awk '
{
	name = $1
	# A node listed for several partitions is counted once only (first line wins)
	if (counted[name] != "") next
	counted[name] = name

	ncpus = $2
	st = $4
	gsub(/\*/, "", st)	# Drop any "*" characters from the state field

	nodes_total++
	cpus_total += ncpus
	states[st] = st
	nodes_in[st]++
	cpus_in[st] += ncpus
}
END {
	print " "
	print "Node states summary:"
	rowfmt = "%-7s %5d nodes (%d CPUs)\n"

	# Scan the array ordered by element values:
	# https://www.gnu.org/software/gawk/manual/html_node/Controlling-Scanning.html
	PROCINFO["sorted_in"] = "@val_type_asc"

	for (key in states) {
		st = states[key]
		printf(rowfmt, st, nodes_in[st], cpus_in[st])
	}
	printf(rowfmt, "Total", nodes_total, cpus_total)
}' >> $TMPFILE

#
# Print a job summary
#
# NUMJOBS: number of lines printed by squeue (one job id per line).
# MAXJOBCOUNT: third word of the MaxJobCount line of the Slurm configuration.
NUMJOBS=$(squeue -h -o "%i" | wc -l)
MAXJOBCOUNT=$(scontrol show config | awk '/MaxJobCount/ { print $3 }')
export NUMJOBS MAXJOBCOUNT
{
	echo
	echo "Job summary: $NUMJOBS jobs total (max=$MAXJOBCOUNT) in all partitions."
} >> $TMPFILE
# Warn when NUMJOBS exceeds maxjobpercent percent of the maximum (integer arithmetic):
jobs_warn_limit=$(( maxjobpercent * MAXJOBCOUNT / 100 ))
if (( NUMJOBS > jobs_warn_limit ))
then
	echo "*** WARNING: *** The number of jobs is getting close to the MaxJobCount limit in slurm.conf." >> $TMPFILE
fi

# Calculate the maximum length of all usernames and account names for proper formatting below
# The awk program starts from the exported maxlu/maxla values, widens them to
# the longest User (field 1) and Account (field 2) seen in the "|"-separated
# sacctmgr listing, and prints two "export" commands which are then evaluated.
eval "$(sacctmgr -snrp show user format="User,Account" | awk -F'|' '
BEGIN {
	ulen = ENVIRON["maxlu"]
	alen = ENVIRON["maxla"]
}
length($1) > ulen { ulen = length($1) }
length($2) > alen { alen = length($2) }
END {
	print "export maxlu=" ulen
	print "export maxla=" alen
}')"

# Print a header
# print_header: write the header lines collected in $TMPFILE to stdout unless
# comma separated output (-C) was requested, then delete the file.  The file is
# deleted in both cases so that -C runs do not leave it behind.
print_header()
{
	if (( ${comma_separated:-0} != 1 ))
	then
		cat -- "$TMPFILE"
	fi
	rm -f -- "$TMPFILE"
}
print_header

# Print the column headings of the job table.
# The awk program consists of a BEGIN rule only; the settings come from the
# exported variables maxlu, maxla, comma_separated and reason_print.
# With comma separated output the headings are joined by commas and the
# leading blank line and the "=====" rule are left out.
awk '
BEGIN {
	ulen = ENVIRON["maxlu"]
	alen = ENVIRON["maxla"]
	comma = ENVIRON["comma_separated"]
	reasons = ENVIRON["reason_print"]
	csv = (comma == 1)
	rule = "============================="

	if (!csv)
		print " "

	if (reasons == 0) {
		# Seven columns: user, account, running jobs/CPUs, idle jobs/CPUs, further info
		fmt = csv ? "%s,%s,%s,%s,%s,%s,%s\n" : sprintf("%%-%d.%ds %%-%d.%ds %%6.6s %%6.6s   %%6.6s %%6.6s  %%s\n", ulen, ulen, alen, alen)
		printf(fmt, "Username/", "", "Running", "", "Idle", "", "")
		printf(fmt, "Totals", "Account", "Jobs", "CPUs", "Jobs", "CPUs", "Further info")
		if (!csv)
			printf(fmt, rule, rule, rule, rule, rule, rule, rule)
	} else {
		# Fifteen columns: as above plus jobs/CPUs for each of the four job reasons
		rfmt = csv ? "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n" : sprintf("%%-%d.%ds %%-%d.%ds %%6.6s %%6.6s   %%6.6s %%6.6s   %%6.6s %%6.6s   %%6.6s %%6.6s   %%6.6s %%6.6s   %%6.6s %%6.6s  %%s\n", ulen, ulen, alen, alen)
		printf(rfmt, "Username/", "",
			"Running", "", "Idle", "",
			"Priority", "", "Dependency", "", "CpuLimit", "", "Held", "",
			"")
		printf(rfmt, "Totals", "Account",
			"Jobs", "CPUs", "Jobs", "CPUs",
			"Jobs", "CPUs", "Jobs", "CPUs", "Jobs", "CPUs", "Jobs", "CPUs",
			"Further info")
		if (!csv)
			printf(rfmt, rule, rule,
				rule, rule, rule, rule,
				rule, rule, rule, rule, rule, rule, rule, rule,
				rule)
	}
}' < /dev/null

# The squeue -o format selects fields we are interested in:
# Userid Account Jobstatus Nodecount CPUcount Partition Reason
squeue_fmt="%u %a %.10T %.6D %.6C %P %r"

# summarize_jobs: read job lines in the squeue_fmt layout on stdin and print
# one line per user/account pair, one ACCT_TOTAL line per account and one
# GRAND_TOTAL line on stdout (unsorted).
# Settings are read from the environment (ENVIRON):
#   username            print only this user (must exist in "getent passwd")
#   account             print only this account
#   maxlu, maxla        widths of the user and account columns
#   comma_separated     1 = comma separated output
#   account_totals_only 1 = suppress the per-user lines
#   reason_print        1 = add the job Reason columns
# Requires gawk 4.0 or later (arrays of arrays).
summarize_jobs()
{
	awk '
BEGIN {
	uname=ENVIRON["username"]
	gname=ENVIRON["account"]
	maxlu=ENVIRON["maxlu"]
	maxla=ENVIRON["maxla"]
	comma_separated=ENVIRON["comma_separated"]
	account_totals_only=ENVIRON["account_totals_only"]
	reason_print=ENVIRON["reason_print"]
	# Then get the list of user full names from passwd lines.
	# getline returns -1 on error, so test for > 0 to avoid an endless loop.
	while (("getent passwd" | getline) > 0) {
		split($0,b,":")		# Split password line into fields
		fullname[b[1]] = b[5]	# Full name b[5] of this username (b[1])
	}
	close("getent passwd")
	if (length(uname)>0) {		# Select username
		if (uname in fullname) {
			userselect=1	# User found
		} else {
			printf("ERROR: Username %s is not in password file\n", uname) > "/dev/stderr"
			errorexit = -1
		}
	}
	if (errorexit != 0) exit errorexit	# Will jump to END section
}

{
	# Process the job list
	userid=$1
	account=$2
	status=$3
	nnodes=$4
	nprocs=$5
	partition=$6
	reason=$7
	# Select specified username only.  Compare as strings (exact match):
	# a regular expression match would also select users whose name is a
	# substring of the requested name.
	if (userselect > 0 && (userid "") != (uname "")) next
	userlist[userid][account] = userid
	accountlist[userid][account] = account

	if (status == "RUNNING" ) {
		running[userid][account]++
		totalrun++
		runprocs[userid][account] += nprocs
		totalrunprocs += nprocs
	} else {
		# Every job state other than RUNNING is counted as idle
		idle[userid][account]++
		idleprocs[userid][account] += nprocs
		totalidle++
		totalidleprocs += nprocs
		if (reason_print == 1) {
			# Only the reasons listed here get a column of their own
			if (reason == "Dependency") {
				Dependency[userid][account]++
				Dependency_procs[userid][account] += nprocs
				totalDependency++
				totalDependency_procs += nprocs
			} else if (reason == "Resources" || reason == "Priority") {
				Priority[userid][account]++
				Priority_procs[userid][account] += nprocs
				totalPriority++
				totalPriority_procs += nprocs
			} else if (reason == "AssocGrpCpuLimit" || reason == "AssocGrpCPURunMinutesLimit") {
				CpuLimit[userid][account]++
				CpuLimit_procs[userid][account] += nprocs
				totalCpuLimit++
				totalCpuLimit_procs += nprocs
			} else if (reason == "JobHeldUser" || reason == "JobHeldAdmin") {
				Held[userid][account]++
				Held_procs[userid][account] += nprocs
				totalHeld++
				totalHeld_procs += nprocs
			}
		}
	}

} END {
	if (errorexit != 0) exit errorexit	# Error encountered
	# Output line format string
	if (comma_separated==1)
		fmt = "%s,%s,%d,%d,%d,%d,%s\n"
	else
		fmt = sprintf("%%-%d.%ds %%-%d.%ds %%6d %%6d   %%6d %%6d  %%s\n",
			maxlu, maxlu, maxla, maxla)
	# Additional columns for job reasons
	if (comma_separated==1)
		reasonfmt = "%s,%s,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%s\n"
	else
		reasonfmt = sprintf("%%-%d.%ds %%-%d.%ds %%6d %%6d   %%6d %%6d   %%6d %%6d   %%6d %%6d   %%6d %%6d   %%6d %%6d  %%s\n",
			maxlu, maxlu, maxla, maxla)

	for (u in userlist) {
		for (a in userlist[u]) {
			userid = userlist[u][a]
			account = accountlist[u][a]
			# Proceed for specified account only (exact string match)
			if (length(gname)>0 && (gname "") != (account "")) continue
			totalusers++
			if (account_totals_only==0) {
				if (reason_print == 0)
					printf(fmt,
					userid, account, running[u][a], runprocs[u][a], idle[u][a], idleprocs[u][a],
					fullname[userid])
				else
					printf(reasonfmt,
					userid, account, running[u][a], runprocs[u][a], idle[u][a], idleprocs[u][a],
					Priority[u][a], Priority_procs[u][a], Dependency[u][a], Dependency_procs[u][a],
					CpuLimit[u][a], CpuLimit_procs[u][a], Held[u][a], Held_procs[u][a],
					fullname[userid])
			}
			# Accumulate account statistics
			accounts[account] = account
			accountusers[account]++
			account_running[account] += running[u][a]
			account_runprocs[account] += runprocs[u][a]
			account_idle[account] += idle[u][a]
			account_idleprocs[account] += idleprocs[u][a]
			if (reason_print == 1) {
				account_Dependency[account] += Dependency[u][a]
				account_Dependency_procs[account] += Dependency_procs[u][a]
				account_Priority[account] += Priority[u][a]
				account_Priority_procs[account] += Priority_procs[u][a]
				account_CpuLimit[account] += CpuLimit[u][a]
				account_CpuLimit_procs[account] += CpuLimit_procs[u][a]
				account_Held[account] += Held[u][a]
				account_Held_procs[account] += Held_procs[u][a]
			}
		}
	}
	if (userselect > 0) exit	# Finished (username specified)

	# Account info format string
	accountfmt = "Running+Idle=%d CPUs, %d users"
	for (acct in accounts) {
		accountinfo = sprintf(accountfmt, account_runprocs[acct]+account_idleprocs[acct], accountusers[acct])
		if (reason_print == 0)
			printf(fmt,
			"ACCT_TOTAL", acct, account_running[acct], account_runprocs[acct], account_idle[acct], account_idleprocs[acct],
			accountinfo)
		else
			printf(reasonfmt,
			"ACCT_TOTAL", acct, account_running[acct], account_runprocs[acct], account_idle[acct], account_idleprocs[acct],
			account_Priority[acct], account_Priority_procs[acct], account_Dependency[acct], account_Dependency_procs[acct],
			account_CpuLimit[acct], account_CpuLimit_procs[acct], account_Held[acct], account_Held_procs[acct],
			accountinfo)
	}
	if (length(gname) > 0) exit	# Finished (account specified)

	accountinfo = sprintf(accountfmt, totalrunprocs+totalidleprocs, totalusers)
	if (reason_print == 0)
		printf(fmt,
		"GRAND_TOTAL", "ALL", totalrun, totalrunprocs, totalidle, totalidleprocs,
		accountinfo)
	else
		printf(reasonfmt,
		"GRAND_TOTAL", "ALL", totalrun, totalrunprocs, totalidle, totalidleprocs,
		totalPriority, totalPriority_procs, totalDependency, totalDependency_procs,
		totalCpuLimit, totalCpuLimit_procs, totalHeld, totalHeld_procs,
		accountinfo)
}'
}

# $partition, $qoslist and $SORT_SEP are left unquoted on purpose: each holds
# either nothing or an option together with its argument ("-p x", "-q y", "-t ,").
squeue $partition $qoslist --noheader -o "$squeue_fmt" | summarize_jobs | env LC_ALL=C sort $SORT_SEP -r -n -k 4 -k 6 -k 1d

# The sort command sorts number of running procs in descending order on keys 4,6 and alphabetical sort on key 1
# The LC_ALL=C ensures that Upper case is sorted before lower case.
